##Einlesen der Tabellen und umwandeln der Tabellen in data frames
library(readxl)
## Warning: Paket 'readxl' wurde unter R Version 4.3.1 erstellt
library(dplyr)
##
## Attache Paket: 'dplyr'
## Die folgenden Objekte sind maskiert von 'package:stats':
##
## filter, lag
## Die folgenden Objekte sind maskiert von 'package:base':
##
## intersect, setdiff, setequal, union
Endung <- "_dengue_extracted.xlsx"
Anfang <- "Dengue"
for (i in 2006:2020) {
Dateiname <- paste0(i, Endung) #erstellen das Dateinamens
Name_df <- paste0(Anfang, i)
assign(Name_df, read_excel(Dateiname)) #erstellen der Variable und zuweisen der Excel-Tabelle
df_dengue <- get(Name_df)
vec_data_names <- vector("character")
vec_data_names <- sapply(df_dengue$Reporting_areas, function(g) {
paste(g[1], i)
})
data_names <- matrix(vec_data_names,nrow = 77, ncol = 1)
df_dengue <- cbind(data_names, df_dengue)
assign(Name_df,df_dengue)
}
total_years = data.frame()
for (i in 2006:2020){
a = get(paste0("Dengue",i))
total_years = rbind(total_years, a)
}
#Rows als Zeitachse mit Monaten
m = 1
data_total = data.frame(col1 = character(0), col2 = character(0), col3 = numeric(0), month = character(0))
colnames(data_total) = c("data_names", "Reporting_areas", "dengue_cases", "month")
sorted_total_years = total_years[order(total_years$"Reporting_areas"),]
for (k in 1:1155){
for (j in 3:14){
square <- sorted_total_years[k, c(1, 2, j)]
month = colnames(square[3])
square <- cbind(square,month)
colnames(square) = c("data_names", "Reporting_areas", "dengue_cases", "month")
data_total <- rbind(data_total, square)
colnames(data_total) = c("data_names", "Reporting_areas", "dengue_cases", "month")
data_total[m,1] = paste(data_total[m,1], month[1])
m = m + 1
}
}
#adding temperature, longitude and latitude column
m = 1
temp_data = read_excel("era5_data_2006_2020_thailand_monmean.xlsx")
temp_data = as.data.frame(temp_data)
temp_data_sorted = data.frame( col1 = numeric(0), Longitude = numeric(0), Latitude = numeric(0))
for (i in 1:77) {
for (j in 5:184) {
temp_data_sorted[m,1] = temp_data[i,j]
temp_data_sorted[m,2] = temp_data[i,3]
temp_data_sorted[m,3] = temp_data[i,4]
m = m + 1
}
}
data_total = cbind(data_total, temp_data_sorted)
colnames(data_total)[5] = "temperature"
#Einlesen Population data
library(readxl)
Endung <- "_population.xlsx"
#empty dataframe for the data
total_population = data.frame(Reporting_areas = character(), population_count = numeric())
# stack years of 2006-2011 below each other
for (i in 2006:2011) {
population_df = data.frame()
Dateiname <- paste0(i, Endung) #erstellen das Dateinamens
population_df = read_excel(Dateiname) #erstellen der Variable und zuweisen der Excel-Tabelle
vec_years = rep(i, times = 77)
population_df = cbind(population_df,vec_years)
names(population_df) = c("Reporting_areas", "population_count", "year")
# multiply every row 12 times for each month
repeat_count <- 12
population_df_replicated <- population_df[rep(row.names(population_df), each = repeat_count), ]
row.names(population_df_replicated) <- NULL
#put all the years together
total_population = rbind(total_population, population_df_replicated)
}
# stack years of 2012-2020 below each other and add to data_total
population_df = read_excel("2012-2020_population.xlsx")
colnames(population_df) = c("Reporting_areas", "population_count", "population_count", "population_count", "population_count", "population_count","population_count","population_count","population_count","population_count")
pop_placeholder = data.frame(Reporting_areas = character(), population_count = numeric(), year = numeric())
col_num = 2
for (i in 2012:2020) {
year = rep(i, times = 77)
pop_placeholder = rbind(pop_placeholder, cbind(population_df[,c(1,col_num)],year))
names(pop_placeholder) = c("Reporting_areas", "population_count", "year")
col_num = col_num + 1
}
# multiply every row 12 times for each month
repeat_count <- 12
population_df_replicated <- pop_placeholder[rep(row.names(pop_placeholder), each = repeat_count), ]
row.names(population_df_replicated) <- NULL
# combine all population data in each
total_population = rbind(total_population, population_df_replicated)
sorted_total_population = total_population[order(total_population$"Reporting_areas"),]
data_total = cbind(data_total,sorted_total_population[,c(2,3)])
row.names(data_total) = NULL
first_date = as.Date("2006-01-01")
last_date = as.Date("2020-12-31")
time_column_short = seq(first_date, last_date, by = "month")
time_column_short = as.Date(time_column_short)
time_column = rep(time_column_short, times = 77)
data_total = cbind(data_total, time_column)
#data_total = subset(data_total, select = -time_column)
#Detect missing values
detect_na <- function(x) {
is_na <- x %in% c("N/A", "na", "NA", "n/a")
replace(x, is_na, NA)
}
data_total <- apply(data_total, 2, detect_na)
rmv.rows = apply(data_total,1,function(x){sum(is.na(x))})
i.missing = which(rmv.rows >0)
#Inzindenz
data_total = as.data.frame(data_total)
data_total$dengue_cases = as.integer(data_total$dengue_cases)
## Warning: NAs durch Umwandlung erzeugt
data_total$population_count = as.integer(data_total$population_count)
dimensions = dim(data_total)
R= dimensions[1]
for(Zeile in 1:R){
Population <- data_total$population_count[Zeile]
Infection <- data_total$dengue_cases[Zeile]
Incidence = (Infection/Population)*100000
data_total$incidence[Zeile] <- Incidence
}
#create clean data frame
detect_na = function(x) {
is_na = x %in% c("N/A", "na", "NA", "n/a")
replace(x, is_na, NA)
}
data_total = apply(data_total, 2, detect_na)
rmv.rows = apply(data_total,1,function(x){sum(is.na(x))})
i.missing = which(rmv.rows >0)
clean_data_total = as.data.frame(data_total[-i.missing,])
num_col = c("temperature", "Longitude", "Latitude", "incidence")
int_col = c("dengue_cases","population_count","year")
date_col = c("time_column")
clean_data_total[int_col] = lapply(clean_data_total[int_col], as.integer)
clean_data_total[num_col] = lapply(clean_data_total[num_col], as.numeric)
clean_data_total[date_col] = lapply(clean_data_total[date_col], as.Date)
summary(clean_data_total)
## data_names Reporting_areas dengue_cases month
## Length:13727 Length:13727 Min. : 0.00 Length:13727
## Class :character Class :character 1st Qu.: 16.00 Class :character
## Mode :character Mode :character Median : 43.00 Mode :character
## Mean : 98.19
## 3rd Qu.: 103.00
## Max. :8324.00
## temperature Longitude Latitude population_count
## Min. :17.67 Min. : 98.03 Min. : 6.178 Min. : 175121
## 1st Qu.:26.13 1st Qu.: 99.79 1st Qu.:13.195 1st Qu.: 467476
## Median :27.23 Median :100.55 Median :14.857 Median : 639063
## Mean :27.05 Mean :101.01 Mean :14.203 Mean : 841759
## 3rd Qu.:28.35 3rd Qu.:102.13 3rd Qu.:16.628 3rd Qu.:1062699
## Max. :34.43 Max. :105.11 Max. :19.848 Max. :5716248
## year time_column incidence
## Min. :2006 Min. :2006-01-01 Min. : 0.000
## 1st Qu.:2009 1st Qu.:2009-11-01 1st Qu.: 2.753
## Median :2013 Median :2013-07-01 Median : 6.652
## Mean :2013 Mean :2013-07-05 Mean : 11.274
## 3rd Qu.:2017 3rd Qu.:2017-04-01 3rd Qu.: 13.947
## Max. :2020 Max. :2020-12-01 Max. :370.477
#struture data frame and convert into right datatypes
data_total = as.data.frame(data_total)
num_col = c("temperature", "Longitude", "Latitude", "incidence")
int_col = c("dengue_cases","population_count","year")
date_col = c("time_column")
data_total[int_col] = lapply(data_total[int_col], as.integer)
data_total[num_col] = lapply(data_total[num_col], as.numeric)
data_total[date_col] = lapply(data_total[date_col], as.Date)
data_total = subset(data_total, select = c("data_names", "Reporting_areas", "year", "month", "time_column", "dengue_cases", "population_count", "temperature", "Longitude", "Latitude", "incidence"))
save(data_total, file = "data_total.RData")
summary(data_total)
## data_names Reporting_areas year month
## Length:13860 Length:13860 Min. :2006 Length:13860
## Class :character Class :character 1st Qu.:2009 Class :character
## Mode :character Mode :character Median :2013 Mode :character
## Mean :2013
## 3rd Qu.:2017
## Max. :2020
##
## time_column dengue_cases population_count temperature
## Min. :2006-01-01 Min. : 0.00 Min. : 175121 Min. :17.67
## 1st Qu.:2009-09-23 1st Qu.: 16.00 1st Qu.: 467411 1st Qu.:26.10
## Median :2013-06-16 Median : 43.00 Median : 638660 Median :27.22
## Mean :2013-06-16 Mean : 98.19 Mean : 840182 Mean :27.03
## 3rd Qu.:2017-03-08 3rd Qu.: 103.00 3rd Qu.:1060272 3rd Qu.:28.33
## Max. :2020-12-01 Max. :8324.00 Max. :5716248 Max. :34.43
## NA's :133 NA's :60
## Longitude Latitude incidence
## Min. : 98.03 Min. : 6.178 Min. : 0.000
## 1st Qu.: 99.79 1st Qu.:13.195 1st Qu.: 2.753
## Median :100.55 Median :14.857 Median : 6.652
## Mean :101.03 Mean :14.231 Mean : 11.274
## 3rd Qu.:102.13 3rd Qu.:16.628 3rd Qu.: 13.947
## Max. :105.11 Max. :19.848 Max. :370.477
## NA's :133
for (i in 1:dim(data_total)[1]) {
if(data_total$Reporting_areas[i]=="Bungkan"){
Datum = data_total$time_column[i]
for(j in 1:dim(data_total)[1]){
if(data_total$Reporting_areas[j] == "Nong Khai" & data_total$time_column[j] == Datum){
data_total$dengue_cases[j]= ifelse(is.na(data_total$dengue_cases[i]), data_total$dengue_cases[j], data_total$dengue_cases[i]+data_total$dengue_cases[j])
data_total$population_count[j]= ifelse(is.na(data_total$population_count[i]), data_total$population_count[j], data_total$population_count[i]+data_total$population_count[j])
data_total$incidence[j]= ifelse(is.na(data_total$incidence[i]), data_total$incidence[j], data_total$incidence[i]+data_total$incidence[j])
}
}
}
}
#set values of Bungkan equal to Nong Khai for mapping
map_data_total = data_total
for (k in 1:dim(data_total)[1]){
Datum = data_total$time_column[k]
if (data_total$Reporting_areas[k]=="Bungkan") {
for(p in 1:dim(data_total)[1]){
if(data_total$Reporting_areas[p] == "Nong Khai" & data_total$time_column[p] == Datum){
map_data_total[k,c("temperature", "incidence", "dengue_cases")] = data_total[p,c("temperature", "incidence", "dengue_cases")]
}
}
}
}
data_total = data_total[data_total$Reporting_areas != "Bungkan", ]
row.names(data_total) = NULL
#create data frame for monsoon season
vec_month_monsoon = c("Jun", "Jul", "Aug", "Sep")
data_monsoon = data_total[data_total$month %in% vec_month_monsoon,]
rmv.rows.m = apply(data_monsoon,1,function(x){sum(is.na(x))})
i.missing.m = which(rmv.rows.m >0)
clean_data_monsoon = as.data.frame(data_monsoon[-i.missing.m,])
month_vec = c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec" )
total_thailand = data.frame(data_names = character(), dengue_cases = numeric())
for (i in 2006:2020) {
name_years = paste0("rows_", i)
assign(name_years, data_total[data_total$year == i, ])
for (month in month_vec) {
name_months = paste0(name_years, month)
a = get(name_years)
assign(name_months, a[a$month == month, ])
b = get(name_months)
total_cases = sum(b$dengue_cases, na.rm = T)
name_column = paste0(i,"_", month)
placeholder_df = data.frame(data_names = name_column, dengue_cases = total_cases)
total_thailand = rbind(total_thailand, placeholder_df)
}
}
total_thailand = cbind(total_thailand, time_column_short)
#max, min, sd
apply(data_total[,c(num_col, int_col)], 2, function(x){max(x,na.rm=TRUE)})
## temperature Longitude Latitude incidence
## 3.443087e+01 1.051115e+02 1.984765e+01 3.704772e+02
## dengue_cases population_count year
## 8.324000e+03 5.716248e+06 2.020000e+03
apply(data_total[,c(num_col, int_col)], 2, function(x){min(x,na.rm=TRUE)})
## temperature Longitude Latitude incidence
## 1.766781e+01 9.803039e+01 6.177542e+00 0.000000e+00
## dengue_cases population_count year
## 0.000000e+00 1.751210e+05 2.006000e+03
apply(data_total[,c(num_col, int_col)], 2, function(x){mean(x,na.rm=TRUE)})
## temperature Longitude Latitude incidence
## 27.03971 100.99276 14.17976 11.37365
## dengue_cases population_count year
## 99.06063 847552.03596 2013.00000
apply(data_total[,c(num_col, int_col)], 2, function(x){median(x,na.rm=TRUE)})
## temperature Longitude Latitude incidence
## 2.722116e+01 1.005451e+02 1.483826e+01 6.719759e+00
## dengue_cases population_count year
## 4.300000e+01 6.612255e+05 2.013000e+03
apply(data_total[,c(num_col, int_col)], 2, function(x){sd(x,na.rm=TRUE)})
## temperature Longitude Latitude incidence
## 2.117061e+00 1.717002e+00 3.478485e+00 1.571335e+01
## dengue_cases population_count year
## 2.035782e+02 7.219595e+05 4.320652e+00
#percentage in the monsoon season
#percentage monsoon cases verses over all cases
monsoon_perc = sum(data_monsoon$incidence, na.rm = T)/sum(data_total$incidence, na.rm = T)
monsoon_perc
## [1] 0.5601323
#average temperature in monsoon vs over all
mean(data_monsoon$temperature, na.rm = T)
## [1] 27.68275
mean(data_total$temperature, na.rm = T)
## [1] 27.03971
data_not_monsoon = data_total[!(data_total$month %in% c("Jun", "Jul", "Aug", "Sep")),]
mean(data_not_monsoon$temperature, na.rm = T)
## [1] 26.71819
#mean of dengue_cases and temperature of whole thailand per month
library(ggplot2)
## Warning: Paket 'ggplot2' wurde unter R Version 4.3.1 erstellt
library(scales)
## Warning: Paket 'scales' wurde unter R Version 4.3.1 erstellt
#dengue cases of whole thailand per month
f_mean_dengue = function(){
mean_dengue <- data.frame(dengue_cases = numeric(), time_column = as.Date(character()))
months = seq(from = as.Date("2006-01-01"), to = as.Date("2020-12-01"), by = "month")
for (i in months) {
month = data_total[data_total$time_column == i,]
m_month = data.frame(dengue_cases = mean(month$dengue_cases, na.rm = TRUE), time_column = month$time_column[1])
mean_dengue = rbind(mean_dengue, m_month)
}
return(mean_dengue)
}
f_mean_temp = function(){
mean_temp <- data.frame(temperature = numeric(), time_column = as.Date(character()))
months = seq(from = as.Date("2006-01-01"), to = as.Date("2020-12-01"), by = "month")
for (i in months) {
month = data_total[data_total$time_column == i,]
m_month = data.frame(temperature = mean(month$temperature, na.rm = TRUE), time_column = month$time_column[1])
mean_temp = rbind(mean_temp, m_month)
}
return(mean_temp)
}
plot1 = ggplot2::ggplot() +
geom_line(data = f_mean_dengue(), aes(x = time_column, y = dengue_cases)) +
labs(x = "time", y = "dengue-cases") +
scale_x_continuous(breaks = seq(from = as.Date("2006-01-01"), to = as.Date("2020-12-01"), by = "2 year"))
plot2 = ggplot2::ggplot() +
geom_line(data = f_mean_temp(), aes(x = time_column, y = temperature), color = "red") +
labs(x = "time", y = "temperature") +
scale_x_continuous(breaks = seq(from = as.Date("2006-01-01"), to = as.Date("2020-12-01"), by = "2 year"))
gridExtra::grid.arrange(plot1, plot2, nrow = 2)
#mean and median comparison
library(ggplot2)
f_median_dengue = function(){
median_dengue <- data.frame(dengue_cases = numeric(), time_column = as.Date(character()))
months = seq(from = as.Date("2006-01-01"), to = as.Date("2020-12-01"), by = "month")
for (i in months) {
month = data_total[data_total$time_column == i,]
m_month = data.frame(dengue_cases = median(month$dengue_case, na.rm = TRUE), time_column = month$time_column[1])
median_dengue = rbind(median_dengue, m_month)
}
return(median_dengue)
}
f_median_temp = function(){
median_temp <- data.frame(temperature = numeric(), time_column = as.Date(character()))
months = seq(from = as.Date("2006-01-01"), to = as.Date("2020-12-01"), by = "month")
for (i in months) {
month = data_total[data_total$time_column == i,]
m_month = data.frame(temperature = median(month$temperature, na.rm = TRUE), time_column = month$time_column[1])
median_temp = rbind(median_temp, m_month)
}
return(median_temp)
}
#compute difference between mean and median
dif_mm_dengue = f_mean_dengue()["dengue_cases"]-f_median_dengue()["dengue_cases"]
dif_mm_dengue = cbind(dif_mm_dengue,f_mean_dengue()["time_column"])
dif_mm_temp = f_mean_temp()["temperature"]-f_median_temp()["temperature"]
dif_mm_temp = cbind(dif_mm_temp,f_mean_temp()["time_column"])
plot_mean_temp = ggplot2::ggplot() +
geom_line(data = f_mean_temp(), aes(x = time_column, y = temperature)) +
labs(x = "time", y = "temperature", title = "Mean temperature")
plot_mean_dengue = ggplot2::ggplot() +
geom_line(data = f_mean_dengue(), aes(x = time_column, y = dengue_cases)) +
labs(x = "time", y = "dengue-cases", title = "Mean dengue cases")
plot_median_temp = ggplot2::ggplot() +
geom_line(data = f_median_temp(), aes(x = time_column, y = temperature)) +
labs(x = "time", y = "temperature", title = "Median temperature")
plot_median_dengue = ggplot2::ggplot() +
geom_line(data = f_median_dengue(), aes(x = time_column, y = dengue_cases)) +
labs(x = "time", y = "dengue-cases", title = "Median dengue cases")
plot_dif_dengue = ggplot2::ggplot() +
geom_line(data = dif_mm_dengue, aes(x = time_column, y = dengue_cases)) +
labs(x = "time", y = "dengue-cases", title = "Difference between mean and median")
plot_dif_temp = ggplot2::ggplot() +
geom_line(data = dif_mm_temp, aes(x = time_column, y = temperature)) +
labs(x = "time", y = "temperature", title = "Difference between mean and median")
gridExtra::grid.arrange(plot_mean_dengue, plot_mean_temp, plot_median_dengue, plot_median_temp,plot_dif_dengue,plot_dif_temp, nrow = 3)
#mean of dengue_cases and temperature of whole thailand per year
f_mean_dengue.y.t = function(){
mean_dengue.y.t <- data.frame(dengue_cases = numeric(), time_column = as.Date(character()))
for (i in c(2006:2020)) {
month = data_total[data_total$year == i,]
m_month = data.frame(dengue_cases = mean(month$dengue_cases, na.rm = TRUE), time_column = month$time_column[1])
mean_dengue.y.t = rbind(mean_dengue.y.t, m_month)
}
return(mean_dengue.y.t)
}
mean_dengue.y.t = as.data.frame(f_mean_dengue.y.t())
f_mean_temp.y.t = function(){
mean_temp.y.t <- data.frame(temperature = numeric(), time_column = as.Date(character()))
for (i in c(2006:2020)) {
month = data_total[data_total$year == i,]
m_month = data.frame(temperature = mean(month$temperature, na.rm = TRUE), time_column = month$time_column[1])
mean_temp.y.t = rbind(mean_temp.y.t, m_month)
}
return(mean_temp.y.t)
}
mean_temp.y.t = as.data.frame(f_mean_temp.y.t())
#mean of incidence and temperature of each provinze per year
f_mean_inc.y.p = function(period, areas){
mean_inc.y.p <- data.frame(incidence = numeric(), time_column = as.Date(character()), Reporting_areas = character())
for (i in period){
year = data_total[data_total$year == i,]
for (p in areas) {
provinz = year[year$Reporting_areas == p,]
m_y_p = data.frame(incidence = mean(provinz$incidence, na.rm = TRUE), time_column = provinz$time_column[1], Reporting_areas = provinz$Reporting_areas[1])
mean_inc.y.p = rbind(mean_inc.y.p, m_y_p)
}
}
return(mean_inc.y.p)
}
f_mean_temp.y.p = function(period, areas){
mean_temp.y.p <- data.frame(temperature = numeric(), time_column = as.Date(character()), Reporting_areas = character())
for (i in period){
year = data_total[data_total$year == i,]
for (p in areas) {
provinz = year[year$Reporting_areas == p,]
m_y_p = data.frame(temperature = mean(provinz$incidence, na.rm = TRUE), time_column = provinz$time_column[1], Reporting_areas = provinz$Reporting_areas[1])
mean_temp.y.p = rbind(mean_temp.y.p, m_y_p)
}
}
return(mean_temp.y.p)
}
#Plot für Bangkok
dengue_plot_cases = c()
dengue_plot_time = c()
dengue_plot_cases <- data_total$dengue_cases[data_total$Reporting_areas == "Bangkok"]
dengue_plot_time <- data_total$time_column[data_total$Reporting_areas == "Bangkok"]
plot(dengue_plot_time,dengue_plot_cases, type = "o")
#distribution of incidence
q.dengue = quantile(data_total$incidence, probs = c(0.01,0.1,0.25,0.5,0.75,0.9,0.99), na.rm = TRUE)
hist(data_total$incidence, breaks = 200, main = "Distribution of incidence", xlab = "incidence", freq = F);abline(v=q.dengue,lty=3,lwd=2,col='red')
lines(density(data_total$incidence, na.rm = T), col = "blue", lwd = 2)
q.temp = quantile(data_total$temperature, probs = c(0.01,0.1,0.25,0.5,0.75,0.9,0.99), na.rm = TRUE)
hist(data_total$temperature, breaks = 200, main = "Distribution of temperature", xlab = "temperature", freq = F);abline(v=q.temp,lty=3,lwd=2,col='red')
curve(dnorm(x, mean = mean(data_total$temperature, na.rm = TRUE), sd = sd(data_total$temperature, na.rm = TRUE)),
add = TRUE, col = "green", lwd = 2)
lines(density(data_total$temperature, na.rm = T), col = "blue", lwd = 2)
#correlation between temperature and incidence
library(npreg)
## Warning: Paket 'npreg' wurde unter R Version 4.3.1 erstellt
library(ggplot2)
smoothed_curve <- ss(clean_data_total$temperature, clean_data_total$incidence, nknots = 10)
plot(clean_data_total$temperature, clean_data_total$incidence, pch = 20,
ylab = "incidence",
xlab = "temperature",
abline(v = mean(clean_data_total$temperature), col = "red"))
plot(smoothed_curve, col = "blue")
plot(rank(data_total$temperature), rank(data_total$incidence), pch = 20,
ylab = "incidence",
xlab = "temperature",
abline(v = mean(data_total$temperature), col = "red"))
qqplot(data_total$temperature, data_total$incidence)
#pearson correlation y.t.
cor(mean_dengue.y.t$dengue_cases, mean_temp.y.t$temperature)
## [1] 0.3958178
#spearman correlation y.t.
cor(mean_dengue.y.t$dengue_cases, mean_temp.y.t$temperature, method = "spearman")
## [1] 0.2913317
#pearson correlation m.p.
cor(clean_data_total$dengue_cases, clean_data_total$temperature)
## [1] 0.1225488
#spearman correlation m.p.
cor(clean_data_total$dengue_cases, clean_data_total$temperature, method = "spearman")
## [1] 0.2593817
pheatmap::pheatmap(cor(clean_data_total[c("dengue_cases", "population_count", "temperature", "incidence")]))
#monsoon season smoothed curve incidence temperature
library(npreg)
library(ggplot2)
smoothed_curve_m <- ss(clean_data_monsoon$temperature, clean_data_monsoon$incidence, nknots = 10)
plot(clean_data_monsoon$temperature, clean_data_monsoon$incidence, pch = 20,
ylab = "incidence",
xlab = "temperature",
abline(v = mean(clean_data_monsoon$temperature), col = "red"))
plot(smoothed_curve, col = "blue")
#only for quantile 5% to 95%
lower_limit = quantile(clean_data_monsoon$temperature, probs = 0.05, na.rm = T)
upper_limit = quantile(clean_data_monsoon$temperature, probs = 0.95, na.rm = T)
clean_data_monsoon_5.95 = subset(clean_data_monsoon, temperature >= lower_limit & temperature <= upper_limit)
smoothed_curve_m_5.95 <- ss(clean_data_monsoon_5.95$temperature, clean_data_monsoon_5.95$incidence, nknots = 10)
plot(clean_data_monsoon_5.95$temperature, clean_data_monsoon_5.95$incidence, pch = 20,
ylab = "incidence",
xlab = "temperature",
abline(v = mean(clean_data_monsoon_5.95$temperature), col = "red"))
plot(smoothed_curve_m_5.95, col = "blue")
#correlation between different areas
#mean of temp and incidence over all years per area
t_data_total = t(data_total)
r_areas = unique(data_total$Reporting_areas)
m_inc_temp.y.r = data.frame(Reporting_areas = character(), incidence = numeric(), temperature = numeric())
for (i in r_areas){
temp_inc = apply(data_total[which(clean_data_total$Reporting_areas == i),c("incidence", "temperature")], 2,function(g){mean(g, na.rm = TRUE)})
temp_inc_df = data.frame(Reporting_areas = i, incidence = temp_inc["incidence"], temperature = temp_inc["temperature"])
m_inc_temp.y.r = rbind(m_inc_temp.y.r, temp_inc_df)
rownames(m_inc_temp.y.r) <- 1:nrow(m_inc_temp.y.r)
}
#k-means clustering
#elbow method
km = kmeans(m_inc_temp.y.r["incidence"], centers = 2, nstart = 10)
km$tot.withinss
## [1] 237.1706
wss = sapply(2:7,function(k) {
kmeans(m_inc_temp.y.r["incidence"], centers = k)$tot.withinss
})
plot(2:7,wss,type='b',pch=19,xlab="Number of clusters K",
ylab="Total within-clusters sum of squares")
#silhoutte method
library(cluster)
## Warning: Paket 'cluster' wurde unter R Version 4.3.1 erstellt
#library (NbClust)
#library (clustertend)
library (factoextra)
## Warning: Paket 'factoextra' wurde unter R Version 4.3.1 erstellt
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
#library (fpc)
#library (clValid)
library(ggplot2)
#find optimal number of clusters by incidence
D_inc = dist(m_inc_temp.y.r["incidence"])
km_inc = kmeans(m_inc_temp.y.r["incidence"], centers = 4, nstart = 10)
s_inc = silhouette(km_inc$cluster,D_inc)
plot(s_inc)
fviz_nbclust(m_inc_temp.y.r["incidence"], pam, method = "silhouette")+ theme_classic()
#find optimal number of clusters by temperature
D_temp = dist(m_inc_temp.y.r["temperature"])
km_temp = kmeans(m_inc_temp.y.r["temperature"], centers = 4, nstart = 10)
s_temp = silhouette(km_temp$cluster,D_temp)
plot(s_temp)
fviz_nbclust(m_inc_temp.y.r["temperature"], pam, method = "silhouette")+ theme_classic()
#find optimal number of clusters by temperature and incidence
D_inc_temp = dist(scale(m_inc_temp.y.r[,c("incidence","temperature")]))
km_inc_temp = kmeans(scale(m_inc_temp.y.r[,c("incidence","temperature")]), centers = 3)
s_inc_temp = silhouette(km_inc_temp$cluster,D_inc_temp)
plot(s_inc_temp)
fviz_nbclust(scale(m_inc_temp.y.r[,c("incidence","temperature")]), pam, method = "silhouette")+ theme_classic()
m_inc_temp_cluster.y.r = data.frame(m_inc_temp.y.r, km_inc_temp$cluster)
plot1 = ggplot(m_inc_temp.y.r, aes(x = temperature, y = incidence, color = as.factor(km_inc$cluster))) +
geom_point() +
labs(title = "Partitioning Clustering Plot by incidence") +
theme_classic()
plot2 = ggplot(m_inc_temp.y.r, aes(x = temperature, y = incidence, color = as.factor(km_temp$cluster))) +
geom_point() +
labs(title = "Partitioning Clustering Plot by temperature") +
theme_classic()
plot2
plot3 = ggplot(m_inc_temp.y.r, aes(x = temperature, y = incidence, color = as.factor(km_inc_temp$cluster))) +
geom_point() +
labs(title = "Partitioning Clustering Plot") +
theme_classic()
plot3
gridExtra::grid.arrange(plot1, plot2, plot3, ncol = 1)
#probieren mit time series Objekten
my_ts <- ts(data_total$dengue_cases, start = c(data_total$time_column[1]), frequency = 12)
head(my_ts)
## [1] NA NA 1 12 29 44
TSstudio::ts_plot(my_ts)
#GAM
library(readxl)
library(nlme)
## Warning: Paket 'nlme' wurde unter R Version 4.3.1 erstellt
##
## Attache Paket: 'nlme'
## Das folgende Objekt ist maskiert 'package:dplyr':
##
## collapse
library(gam)
## Warning: Paket 'gam' wurde unter R Version 4.3.1 erstellt
## Lade nötiges Paket: splines
## Lade nötiges Paket: foreach
## Warning: Paket 'foreach' wurde unter R Version 4.3.1 erstellt
## Loaded gam 1.22-2
library(sdm)
## Warning: Paket 'sdm' wurde unter R Version 4.3.1 erstellt
## Lade nötiges Paket: sp
## Warning: Paket 'sp' wurde unter R Version 4.3.1 erstellt
## The legacy packages maptools, rgdal, and rgeos, underpinning this package
## will retire shortly. Please refer to R-spatial evolution reports on
## https://r-spatial.org/r/2023/05/15/evolution4.html for details.
## This package is now running under evolution status 0
## sdm 1.1-8 (2021-11-11)
library(raster)
## Warning: Paket 'raster' wurde unter R Version 4.3.1 erstellt
##
## Attache Paket: 'raster'
## Das folgende Objekt ist maskiert 'package:nlme':
##
## getData
## Das folgende Objekt ist maskiert 'package:dplyr':
##
## select
library(ncdf4)
library(mgcv)
## Warning: Paket 'mgcv' wurde unter R Version 4.3.1 erstellt
## This is mgcv 1.8-42. For overview type 'help("mgcv-package")'.
##
## Attache Paket: 'mgcv'
## Die folgenden Objekte sind maskiert von 'package:gam':
##
## gam, gam.control, gam.fit, s
attach(data_total)
## Die folgenden Objekte sind maskiert durch .GlobalEnv:
##
## data_names, month, time_column, year
model1 = gam(incidence~s(temperature, k = 16), family = "quasipoisson", na.rm = T)
plot.gam(model1)
summary(model1) #edf: → a value close to 1 tends to be close to a linear term → a higher value means that the spline is more wiggly (non-linear).
##
## Family: quasipoisson
## Link function: log
##
## Formula:
## incidence ~ s(temperature, k = 16)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.36386 0.01243 190.2 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(temperature) 13.82 14.76 42.97 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0465 Deviance explained = 9.31%
## GCV = 11.771 Scale est. = 19.981 n = 13607
#F-Wert und der p-Wert geben die statistische Signifikanz der glatten Funktion an. In diesem Fall ist die glatte Funktion von "temperature" hoch signifikant (p-Wert < 2e-16), was darauf hindeutet, dass sie einen signifikanten Einfluss auf die Vorhersage der "incidence" hat.Der angegebene R-squared-Wert (R-sq.(adj)) von 0.0407 gibt an, wie gut das Modell die Varianz der abhängigen Variable erklärt. In diesem Fall erklärt das Modell etwa 4,07% der Varianz in den Daten. Ein niedriger GCV-Wert deutet auf eine gute Anpassung des Modells hin.
k.check(model1)
## k' edf k-index p-value
## s(temperature) 15 13.81749 0.9896895 0.8025
gam.check(model1)
##
## Method: GCV Optimizer: outer newton
## full convergence after 4 iterations.
## Gradient range [2.609013e-05,2.609013e-05]
## (score 11.77115 & scale 19.98141).
## Hessian positive definite, eigenvalue range [0.001424269,0.001424269].
## Model rank = 16 / 16
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(temperature) 15.0 13.8 0.98 0.66
best_aic = Inf
best_model = NULL
k_best = 0
for (i in 3:50){
# Calculate AIC
smooth_model = gam(incidence~s(temperature, k = i))
aic = AIC(smooth_model)
# Check if the current model has lower AIC than the best model so far
if (aic < best_aic) {
best_aic = aic
best_model = smooth_model
k_best = i
}
}
best_aic
## [1] 112947.1
k_best
## [1] 16
smooth_model = gam(incidence~s(temperature, k = 16))
linear_model = gam(incidence~temperature, k = 16)
AIC(smooth_model, linear_model)
## df AIC
## smooth_model 14.88464 112947.1
## linear_model 3.00000 113361.2
detach(data_total)
library(raster)
library(gam)
library(sdm)
library(readxl)
library(mgcv)
library(ncdf4)
## lower_limit = quantile(data_monsoon$temperature, probs = 0.05, na.rm = T)
#upper_limit = quantile(data_monsoon$temperature, probs = 0.95, na.rm = T)
#data_monsoon_5.95 = subset(data_monsoon, temperature >= lower_limit & temperature <= upper_limit)
attach(data_total)
## Die folgenden Objekte sind maskiert durch .GlobalEnv:
##
## data_names, month, time_column, year
model2 <- gam(incidence~s(temperature, k = 16), family = "quasipoisson")
tas <- stack("tas_SEA22_MPI_rcp85_2021-2040_grid_subc_daymean_monmean_timmean.nc")
#tas_test = crop(tas, my.area)
names(tas) <- "temperature"
p1 <- raster::predict(tas, model2, type = "response")
thailand <- raster::getData('GADM', country='THA', level=1)
## Warning in raster::getData("GADM", country = "THA", level = 1): getData will be removed in a future version of raster
## . Please use the geodata package instead
my.area <- extent(thailand)
my.area = extent(95,107,5.5,20.5)
my.p1 <- crop(p1, my.area)
my.p1.mask <- mask(my.p1, thailand)
plot(my.p1.mask, legend.width=2, legend.shrink=0.5, axes= F, box=F, zlim = c(0,22.8)) + plot(thailand, add=TRUE)
## integer(0)
detach(data_total)
#Plotting der Inzidenz eines einzelnen Monats auf die Karte von Thailand
library(sf)
## Warning: Paket 'sf' wurde unter R Version 4.3.1 erstellt
## Linking to GEOS 3.11.2, GDAL 3.6.2, PROJ 9.2.0; sf_use_s2() is TRUE
library(ggplot2)
district_sf <- st_read("gadm36_THA_shp/gadm36_THA_1.shp")
## Reading layer `gadm36_THA_1' from data source
## `C:\Users\thadl\OneDrive\Dokumente\GitHub\topic04_team04\gadm36_THA_shp\gadm36_THA_1.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 77 features and 10 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 97.34519 ymin: 5.616042 xmax: 105.6391 ymax: 20.46321
## Geodetic CRS: WGS 84
#Inzidenz vom April 2006 auf die karte geplottet
#district_sf$incidence2006Aug <- rows_2006Aug$incidence
#ggplot() +
# geom_sf(data = district_sf, aes(fill = incidence2006Aug)) +
# scale_fill_gradient(low = "yellow", high = "red") + labs(fill = "Incidence in Aug 2006")
#Funktion für Inzidenz plot der Monate
library(sf)
library(ggplot2)
district_sf <- st_read("gadm36_THA_shp/gadm36_THA_1.shp")
map_incidence_monthly = function(incidence_month){
max_incidence = which.max(incidence_month$incidence)
max_incidence_area = incidence_month$Reporting_areas[max_incidence]
ggplot() +
geom_sf(data = district_sf, aes(fill = incidence_month$incidence)) +
scale_fill_gradient(low = "white", high = "green") +
labs(fill = paste("Highest incidence =", max_incidence, "in", max_incidence_area))
}
#input for incidence month in format: rows_YYYY_Mon
map_temp_monthly = function(temp_month){
max_temp = which.max(temp_month$temperature)
max_temp_area = temp_month$Reporting_areas[max_temp]
ggplot() +
geom_sf(data = district_sf, aes(fill = temp_month$temperature)) +
scale_fill_gradient(low = "white", high = "green") +
labs(fill = paste("Highest temperature =", max_temp, "in", max_temp_area))
}
##map_incidence_monthly(f_mean_inc.y.p(period = c(2006:2020), areas = r_areas))
##gridExtra::grid.arrange(map_temp_monthly(f_mean_temp.y.p(period = c(2020), areas = r_areas)),map_incidence_monthly(f_mean_inc.y.p(period = c(2020), areas = r_areas)))
#average incidence 2006 to 2020 map
library(ggplot2)
library(sf)
f_map_mean_inc.y.p = function(period, areas){
map_mean_inc.y.p <- data.frame(incidence = numeric(), time_column = as.Date(character()), Reporting_areas = character())
for (i in period){
year = map_data_total[map_data_total$year == i,]
for (p in areas) {
provinz = year[year$Reporting_areas == p,]
m_y_p = data.frame(incidence = mean(provinz$incidence, na.rm = TRUE), time_column = provinz$time_column[1], Reporting_areas = provinz$Reporting_areas[1])
map_mean_inc.y.p = rbind(map_mean_inc.y.p, m_y_p)
}
}
return(map_mean_inc.y.p)
}
f_map_mean_inc.p = function(areas){
map_mean_inc.p <- data.frame(time_column = as.Date(character()), Reporting_areas = character(), incidence = numeric())
for (p in areas) {
provinz = map_data_total[map_data_total$Reporting_areas == p,]
m_p = data.frame(time_column = provinz$time_column[1], Reporting_areas = provinz$Reporting_areas[1],incidence = mean(provinz$incidence, na.rm = TRUE))
map_mean_inc.p = rbind(map_mean_inc.p, m_p)
}
return(map_mean_inc.p)
}
r_areas2 = unique(data_total$Reporting_areas)
mean_inc.p <- data.frame(time_column = as.Date(character()), Reporting_areas = character(), incidence = numeric())
for (p in r_areas2) {
provinz = data_total[data_total$Reporting_areas == p,]
m_p = data.frame(time_column = provinz$time_column[1], Reporting_areas = provinz$Reporting_areas[1],incidence = mean(provinz$incidence, na.rm = TRUE))
mean_inc.p = rbind(mean_inc.p, m_p)
}
map_mean_inc.p = f_map_mean_inc.p(areas = unique(map_data_total$Reporting_areas))
district_sf = st_read("gadm36_THA_shp/gadm36_THA_1.shp")
## Reading layer `gadm36_THA_1' from data source
## `C:\Users\thadl\OneDrive\Dokumente\GitHub\topic04_team04\gadm36_THA_shp\gadm36_THA_1.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 77 features and 10 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 97.34519 ymin: 5.616042 xmax: 105.6391 ymax: 20.46321
## Geodetic CRS: WGS 84
district_sf$incidence <- map_mean_inc.p$incidence
ggplot2::ggplot() +
geom_sf(data = district_sf, aes(fill = incidence)) +
scale_fill_gradientn(colors = rev(terrain.colors(100)), limits = c(0,22.8)) + labs(fill = "Incidence") + labs(title = "Average incidences for 2006-2020 ")